library(tidyverse) ; library(reshape2) ; library(glue) ; library(plotly) ; library(plotlyutils)
library(RColorBrewer) ; library(viridis) ; require(gridExtra) ; library(GGally) ; library(ggExtra) ; library(ggpubr)
library(biomaRt) ; library(sva) ; library(WGCNA) ; library(vsn) ; library(Biobase) ; library(GEOquery) ; library(limma)
library(dendextend) ; library(expss)
library(knitr) ; library(kableExtra)

Raw data



Load and annotate data


Dataset downloded from two different places:

  • Gene Expression matrix from Gemma

  • Metadata information from GSE with ID GSE102741


# LOAD DATA

# Expression data downliaded from GEMMA (https://gemma.msl.ubc.ca/expressionExperiment/showExpressionExperiment.html?id=11805)
datExpr = read.delim('./../Data/11805_GSE102741_expmat.data.txt.gz', comment.char='#')

datGenes = datExpr %>% dplyr::select(Probe, Sequence, GeneSymbol, GeneName, GemmaId, NCBIid) %>% 
           dplyr::rename('entrezgene' = NCBIid)
datExpr = datExpr %>% dplyr::select(-c(Probe, Sequence, GeneSymbol, GeneName, GemmaId, NCBIid))
colnames(datExpr) = sapply(colnames(datExpr), function(x) strsplit(x, '\\.')[[1]][3]) %>% unname
rownames(datExpr) = datGenes$entrezgene

# Metadata downloaded from GEO
GEO_data = getGEO('GSE102741', destdir='./../Data/')[[1]]

datMeta = GEO_data@phenoData@data %>% 
          mutate(Diagnosis = factor(ifelse(grepl('control', characteristics_ch1), 'CTL', 'ASD'), 
                                    levels = c('CTL','ASD')),
                 Age = substring(characteristics_ch1.4, 6) %>% as.numeric %>% round, 
                 Sex = `Sex:ch1`, 
                 Sample_ID = description, 
                 Ethnicity = substring(characteristics_ch1.6, 7),
                 title = gsub(' ', '', title)) %>%
          dplyr::select(title, geo_accession, Sample_ID, Diagnosis, Age, Sex, Ethnicity)
datMeta = datMeta[match(colnames(datExpr), datMeta$title),]



# GO Neuronal annotations: regex 'neuron' in GO functional annotations and label the genes that make a match as neuronal
GO_annotations = read.csv('./../Data/genes_GO_annotations.csv')
GO_neuronal = GO_annotations %>% filter(grepl('neuron', go_term)) %>% 
              mutate('ID'=as.character(ensembl_gene_id)) %>% 
              dplyr::select(-ensembl_gene_id) %>% distinct(ID) %>%
              mutate('Neuronal'=1)


# SFARI Genes
SFARI_genes = read_csv('./../../../SFARI/Data/SFARI_genes_01-03-2020_w_ensembl_IDs.csv')
SFARI_genes = SFARI_genes[!duplicated(SFARI_genes$ID) & !is.na(SFARI_genes$ID),]


# NCBI biotype annotation
NCBI_biotype = read.csv('./../../../NCBI/Data/gene_biotype_info.csv') %>% 
               dplyr::rename('ensembl_gene_id'=Ensembl_gene_identifier, 'gene_biotype'=type_of_gene, 
                             'hgnc_symbol'=Symbol) %>% 
               mutate(gene_biotype = ifelse(gene_biotype=='protein-coding','protein_coding', gene_biotype))


gg_colour_hue = function(n) {
  hues = seq(15, 375, length = n+1)
  pal = hcl(h = hues, l = 65, c = 100)[1:n]
}

rm(GO_annotations)

Check sample composition



Dataset consists of 52 samples (13 ASD and 39 Controls), all extracted from the DLPFC of the brain

Sequenced using Illumina’s HiSeq 2000 (Gupta used the same, Gandal used Illumina HiSeq 2500, they are compatible).


The dataset includes 25981 genes from 52 samples.

Counts distribution: The data has already been preprocessed, so we have relatively balanced values, centered close to 0

counts = datExpr %>% melt

count_distr = data.frame('Statistic' = c('Min', '1st Quartile', 'Median', 'Mean', '3rd Quartile', 'Max'),
                         'Values' = c(min(counts$value), quantile(counts$value, probs = c(.25, .5)) %>% unname,
                                      mean(counts$value), quantile(counts$value, probs = c(.75)) %>% unname,
                                      max(counts$value)))

count_distr %>% kable(digits = 2, format.args = list(scientific = FALSE)) %>% kable_styling(full_width = F)
Statistic Values
Min -5.77
1st Quartile -1.97
Median 1.06
Mean 0.89
3rd Quartile 3.81
Max 16.47
rm(counts, count_distr)


Diagnosis distribution: There are three times more Control than ASD samples

table_info = datMeta %>% apply_labels(Diagnosis = 'Diagnosis', Sex = 'Gender')
cro(table_info$Diagnosis)
 #Total 
 Diagnosis 
   CTL  39
   ASD  13
   #Total cases  52


Gender distribution: There are thrice as many Male samples than Female ones

cro(table_info$Sex)
 #Total 
 Gender 
   Female  12
   Male  40
   #Total cases  52


Even though the gender is imbalanced, they are not biased by Diagnosis

cro(table_info$Diagnosis, list(table_info$Sex, total()))
 Gender     #Total 
 Female   Male   
 Diagnosis 
   CTL  9 30   39
   ASD  3 10   13
   #Total cases  12 40   52


Age distribution: Subjects between 2 and 69 years old with a mean of 22 years old

summary(datMeta$Age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2.00    9.75   17.00   22.38   27.25   69.00
datMeta %>% ggplot(aes(Age)) +
            geom_density(alpha=0.2, aes(group = Diagnosis, fill = Diagnosis, color = Diagnosis)) +
            geom_density(alpha=0.3, fill='gray', color='gray') +
            theme_minimal()



Annotate genes with BioMart information



I was originally running this with the feb2014 version of BioMart because that’s the one that Gandal used (and it finds all of the Ensembl IDs, which other versions don’t), but it has some outdated biotype annotations, to fix this I’ll obtain all the information except the biotype label from BioMart in the same way as it had been done before, and then I’ll add the most current biotype label using information from NCBI’s website and then from BioMart in the following way:

  1. Use BioMart to run a query with the original feb2014 version using the Ensembl IDs as keys to obtain all the information except the biotype labels (we are not going to retrieve the gene name from biomart because we already extracted it from datExpr)

  2. Annotate genes with Biotype labels:

    2.1 Use the NCBI annotations downloaded from NCBI’s website and processed in NCBI/RMarkdowns/clean_data.html (there is information for only 26K genes, so some genes will remain unlabelled)

    2.2 Use the current version (jan2020) to obtain the biotype annotations using the Ensembl ID as keys (some genes don’t return a match)

    2.3 For the genes that didn’t return a match, use the current version (jan2020) to obtain the biotype annotations using the gene name as keys

    2.4 For the genes that returned multiple labels, use the feb2014 version with the Ensembl IDs as keys


Note: A small proportion of genes don’t make a match in any of these queries, so they will be lost when we start filtering out genes

labels_source = data.frame('source' = c('NCBI', 'BioMart2020_byID', 'BioMart2020_byGene', 'BioMart2014'),
                                      'n_matches' = rep(0,4))

########################################################################################
# 1. Query archive version

# Note: NCBI ID = entrez ID
getinfo = c('entrezgene','ensembl_gene_id','hgnc_symbol','chromosome_name','start_position','end_position','strand')
mart = useMart(biomart = 'ENSEMBL_MART_ENSEMBL', dataset = 'hsapiens_gene_ensembl', 
               host = 'feb2014.archive.ensembl.org')
datGenes_BM = getBM(attributes = getinfo, filters = c('entrezgene'), values = rownames(datExpr), 
                    mart = mart)

datGenes = datGenes %>% mutate(entrezgene = entrezgene %>% as.character %>% as.integer) %>% 
           left_join(datGenes_BM, by = 'entrezgene')

datGenes$length = datGenes$end_position - datGenes$start_position

cat(paste0('1. ', sum(is.na(datGenes$start_position)), '/', nrow(datGenes),
             ' Ensembl IDs weren\'t found in the feb2014 version of BioMart'))
## 1. 7504/29340 Ensembl IDs weren't found in the feb2014 version of BioMart
########################################################################################
########################################################################################
# 2. Get Biotype Labels

cat('2. Add biotype information')
## 2. Add biotype information
########################################################################################
# 2.1 Add NCBI annotations
datGenes = datGenes %>% left_join(NCBI_biotype, by=c('ensembl_gene_id','hgnc_symbol'))

cat(paste0('2.1 ' , sum(is.na(datGenes$gene_biotype)), '/', nrow(datGenes),
             ' Ensembl IDs weren\'t found in the NCBI database'))
## 2.1 13338/29340 Ensembl IDs weren't found in the NCBI database
labels_source$n_matches[1] = sum(!is.na(datGenes$gene_biotype))

########################################################################################
# 2.2 Query current BioMart version for gene_biotype using Ensembl ID as key

getinfo = c('ensembl_gene_id','gene_biotype')
mart = useMart(biomart = 'ENSEMBL_MART_ENSEMBL', dataset = 'hsapiens_gene_ensembl',
               host = 'jan2020.archive.ensembl.org')
datGenes_biotype = getBM(attributes = getinfo, filters = c('ensembl_gene_id'), mart=mart, 
                         values = datGenes$ensembl_gene_id[is.na(datGenes$gene_biotype)])

cat(paste0('2.2 ' , sum(is.na(datGenes$gene_biotype))-nrow(datGenes_biotype), '/', 
           sum(is.na(datGenes$gene_biotype)),
           ' Ensembl IDs weren\'t found in the jan2020 version of BioMart when querying by Ensembl ID'))
## 2.2 9667/13338 Ensembl IDs weren't found in the jan2020 version of BioMart when querying by Ensembl ID
# Add new gene_biotype info to datGenes
datGenes = datGenes %>% left_join(datGenes_biotype, by='ensembl_gene_id') %>%
           mutate(gene_biotype = coalesce(as.character(gene_biotype.x), gene_biotype.y)) %>%
           dplyr::select(-gene_biotype.x, -gene_biotype.y)

labels_source$n_matches[2] = sum(!is.na(datGenes$gene_biotype)) - labels_source$n_matches[1]

########################################################################################
# 3. Query current BioMart version for gene_biotype using gene symbol as key

missing_genes = unique(datGenes$hgnc_symbol[is.na(datGenes$gene_biotype)])
getinfo = c('hgnc_symbol','gene_biotype')
datGenes_biotype_by_gene = getBM(attributes = getinfo, filters = c('hgnc_symbol'), mart = mart,
                                 values = missing_genes)

cat(paste0('2.3 ', length(missing_genes)-length(unique(datGenes_biotype_by_gene$hgnc_symbol)),'/',
           length(missing_genes),
           ' genes weren\'t found in the current BioMart version when querying by gene name'))
## 2.3 148/1279 genes weren't found in the current BioMart version when querying by gene name
dups = unique(datGenes_biotype_by_gene$hgnc_symbol[duplicated(datGenes_biotype_by_gene$hgnc_symbol)])
cat(paste0('    ', length(dups), ' genes returned multiple labels (these won\'t be added)'))
##     3 genes returned multiple labels (these won't be added)
# Update information
datGenes_biotype_by_gene = datGenes_biotype_by_gene %>% filter(!hgnc_symbol %in% dups)
datGenes = datGenes %>% left_join(datGenes_biotype_by_gene, by='hgnc_symbol') %>% 
           mutate(gene_biotype = coalesce(gene_biotype.x, gene_biotype.y)) %>%
           dplyr::select(-gene_biotype.x, -gene_biotype.y)

labels_source$n_matches[3] = sum(!is.na(datGenes$gene_biotype)) - sum(labels_source$n_matches)

########################################################################################
# 4. Query feb2014 BioMart version for the missing biotypes

missing_ensembl_ids = unique(datGenes$ensembl_gene_id[is.na(datGenes$gene_biotype)])

getinfo = c('ensembl_gene_id','gene_biotype')
mart = useMart(biomart = 'ENSEMBL_MART_ENSEMBL', dataset = 'hsapiens_gene_ensembl', 
               host = 'feb2014.archive.ensembl.org')
datGenes_biotype_archive = getBM(attributes = getinfo, filters=c('ensembl_gene_id'), 
                                 values = missing_ensembl_ids, mart=mart)

cat(paste0('2.4 ', length(missing_ensembl_ids)-nrow(datGenes_biotype_archive),'/',length(missing_ensembl_ids),
             ' genes weren\'t found in the feb2014 BioMart version when querying by Ensembl ID'))
## 2.4 1/367 genes weren't found in the feb2014 BioMart version when querying by Ensembl ID
# Update information
datGenes = datGenes %>% left_join(datGenes_biotype_archive, by='ensembl_gene_id') %>% 
            mutate(gene_biotype = coalesce(gene_biotype.x, gene_biotype.y)) %>%
            dplyr::select(-gene_biotype.x, -gene_biotype.y)

labels_source$n_matches[4] = sum(!is.na(datGenes$gene_biotype)) - sum(labels_source$n_matches)

########################################################################################
# Plot results

labels_source = labels_source %>% add_row(source = 'missing', 
                                          n_matches = nrow(datGenes) - sum(labels_source$n_matches)) %>% 
                mutate(x = 1, percentage = round(100*n_matches/sum(n_matches),2),
                       source = factor(source, levels=c('BioMart2014','BioMart2020_byGene','BioMart2020_byID',
                                                        'NCBI','missing')))
                

p = labels_source %>% ggplot(aes(x, percentage, fill=source)) + geom_bar(position='stack', stat='identity') +
    theme_minimal() + coord_flip() + theme(legend.position='bottom', axis.title.y=element_blank(),
    axis.text.y=element_blank(), axis.ticks.y=element_blank()) + ylab('Percentage of genes') +
    scale_fill_manual(values = c(gg_colour_hue(nrow(labels_source)-1),'gray'))

ggplotly(p + theme(legend.position='none'))
as_ggplot(get_legend(p))

########################################################################################
# Reorder rows to match datExpr
datGenes = datGenes[match(rownames(datExpr), datGenes$entrezgene),]


rm(getinfo, mart, datGenes_BM, datGenes_biotype, datGenes_biotype_by_gene, datGenes_biotype_archive,
   dups, missing_ensembl_ids, missing_genes, labels_source, p)

Filtering



Checking how many SFARI genes are in the dataset

df = SFARI_genes %>% dplyr::select(-gene_biotype) %>% inner_join(datGenes, by=c('ID'='ensembl_gene_id'))
n_SFARI = df[['gene-symbol']] %>% unique %>% length

Considering all genes, this dataset contains 786 of the 912 SFARI genes

1.- Filter entries for which we didn’t manage to obtain its genotype information

1.1 Missing Biotype

to_keep = !is.na(datGenes$gene_biotype)

datGenes = datGenes[to_keep,]
datExpr = datExpr[to_keep,]
rownames(datGenes) = datGenes$entrezgene

Removed 7504 ‘genes’, 18477 remaining

Filtering genes without biotype information, we are left with 786 SFARI Genes (we lose 0 genes)




1.2 Missing Length of the sequence

to_keep = !is.na(datGenes$length)

datExpr = datExpr[to_keep,]
datGenes = datGenes[to_keep,]

Removed 0 ‘genes’, 18477 remaining

Filtering genes without sequence length information, we are left with 786 SFARI Genes (we lose 0 genes)




2.- Filter genes that do not encode any protein


85% of the genes are protein coding genes

datGenes$gene_biotype %>% table %>% sort(decreasing=TRUE) %>% kable(caption='Biotypes of genes in dataset') %>%
                          kable_styling(full_width = F)
Biotypes of genes in dataset
. Freq
protein_coding 15790
lncRNA 1310
1 703
3 208
transcribed_unprocessed_pseudogene 133
processed_pseudogene 84
6 46
transcribed_processed_pseudogene 38
lincRNA 31
transcribed_unitary_pseudogene 28
unprocessed_pseudogene 26
pseudogene 20
antisense 18
7 12
processed_transcript 9
snoRNA 6
misc_RNA 4
sense_intronic 2
3prime_overlapping_ncrna 1
miRNA 1
polymorphic_pseudogene 1
rRNA 1
scaRNA 1
sense_overlapping 1
snRNA 1
TEC 1
TR_C_gene 1

Most of the non-protein coding genes have very low levels of expression

plot_data = data.frame('ID' = rownames(datExpr), 'MeanExpr' = apply(datExpr, 1, mean),
                       'ProteinCoding' = datGenes$gene_biotype=='protein_coding')

ggplotly(plot_data %>% ggplot(aes(MeanExpr, fill=ProteinCoding, color=ProteinCoding)) + 
         geom_density(alpha=0.5) + theme_minimal())
rm(plot_data)
df = SFARI_genes %>% dplyr::select(-gene_biotype) %>% inner_join(datGenes, by=c('ID'='ensembl_gene_id'))

Filtering protein coding genes, we are left with 783 SFARI Genes (we lose 3 genes)

Note: The gene name for Ensembl ID ENSG00000187951 is wrong, it should be AC091057.1 instead of ARHGAP11B, but the biotype is right, so it would still be filtered out

n_SFARI = df[['gene-symbol']][df$gene_biotype=='protein_coding'] %>% unique %>% length

df %>% filter(!`gene-symbol` %in% df$`gene-symbol`[df$gene_biotype=='protein_coding']) %>% 
       dplyr::select(ID, `gene-symbol`, `gene-score`, gene_biotype, syndromic, `number-of-reports`) %>% 
       kable(caption='Lost Genes')  %>% kable_styling(full_width = F)
Lost Genes
ID gene-symbol gene-score gene_biotype syndromic number-of-reports
ENSG00000187951 ARHGAP11B 3 lncRNA 0 2
ENSG00000187951 ARHGAP11B 3 lncRNA 0 2
ENSG00000224639 C4B 3 lncRNA 0 6
ENSG00000233067 PTCHD1-AS 2 lncRNA 0 3
rm(df)
if(!all(rownames(datExpr)==rownames(datGenes))) cat('!!! gene rownames do not match!!!')

to_keep = datGenes$gene_biotype == 'protein_coding'
datExpr = datExpr %>% filter(to_keep)
datGenes = datGenes %>% filter(to_keep)
rownames(datExpr) = datGenes$entrezgene
rownames(datGenes) = datGenes$entrezgene


Removed 2687 genes. 15790 remaining


3.- Filter genes with low expression levels

This seems to have already been done in the original preprocessing of the data, so I won’t do it again


4.- Filter outlier samples

Using node connectivity as a distance measure, normalising it and filtering out genes farther away than 2 standard deviations from the left (lower connectivity than average, not higher)

absadj = datExpr %>% bicor %>% abs
netsummary = fundamentalNetworkConcepts(absadj)
ku = netsummary$Connectivity
z.ku = (ku-mean(ku))/sqrt(var(ku))

plot_data = data.frame('sample'=1:length(z.ku), 'distance'=z.ku, 'Sample_ID'=datMeta$Sample_ID, 
                       'Sex'=datMeta$Sex, 'Age'=datMeta$Age, 'Diagnosis'=datMeta$Diagnosis)

selectable_scatter_plot(plot_data, plot_data[,-c(1:3)])

Outlier samples: sample25, sample3, sample31

to_keep = z.ku > -2
datMeta = datMeta[to_keep,]
datExpr = datExpr[,to_keep]

rm(absadj, netsummary, ku, z.ku, plot_data)

Removed 3 samples, 49 remaining

rm(to_keep)





5.- Filter repeated genes

There are 398 genes with more than one ensembl ID in the dataset. To accurately refer to the rows of my data as ‘genes’, I’m going to remove the repeated ones.

dup_genes = datGenes$hgnc_symbol %>% duplicated

datGenes = datGenes[!dup_genes,]
datExpr = datExpr[!dup_genes,]

Removed 398 genes. 15392 remaining

782 SFARI genes remaining (we lost 783 genes)

rm(dup_genes, n_SFARI)



Save filtered and annotated dataset

save(datExpr, datMeta, datGenes, file='./../Data/filtered_raw_data.RData')
#load('./../Data/filtered_raw_data.RData')



Normalisation Exploratory Analysis



I’m going to see how much heteroscedasticity is in this dataset

Using the plotting function DESEq2’s manual proposes to study homoscedasticity, it seems like the genes with the lowest level of expression have higher SD than the rest

meanSdPlot(datExpr %>% as.matrix, plot=FALSE)$gg + theme_minimal()

Plotting points individually we can notice even more heteroscedasticity in the data

plot_data = data.frame('ID'=rownames(datExpr), 'Mean'=rowMeans(datExpr), 'SD'=apply(datExpr,1,sd))

plot_data %>% ggplot(aes(Mean, SD)) + geom_point(color='#0099cc', alpha=0.2) + geom_smooth(color = 'gray') +
              scale_y_log10() + theme_minimal()

rm(plot_data)

It could be useful to work a bit more to obtain the raw counts of this dataset and perform the normalisation myself to see if I can reduce the heterocedasticity we found, but for now, this will have to do




Differential Expression Analysis



Since our data has already been preprocessed, we cannot use DESeq2 to perform the differential expression analysis. instead I’m going to use limma following the procedure in RNA-seq analysis is easy as 1-2-3 with limma, Glimma and edgeR

fit = lmFit(datExpr, design=model.matrix(~Diagnosis, data = datMeta))
efit = eBayes(fit)
DE_info = topTable(efit, sort.by = 'none', n = Inf)
## Removing intercept from test coefficients
rm(fit)




Batch Effects



This dataset was processed in a single batch, so we don’t have any batch-related variables such as processing date or processing lab


Samples don’t seem to cluster together that strongly by any variable

h_clusts = datExpr %>% t %>% dist %>% hclust %>% as.dendrogram

create_viridis_dict = function(){
  min_age = datMeta$Age %>% min
  max_age = datMeta$Age %>% max
  viridis_age_cols = viridis(max_age - min_age + 1)
  names(viridis_age_cols) = seq(min_age, max_age)
  
  return(viridis_age_cols)
}
viridis_age_cols = create_viridis_dict()

dend_meta = datMeta[match(labels(h_clusts), datMeta$title),] %>% 
            mutate('Diagnosis' = ifelse(Diagnosis=='CTL','#008080','#86b300'), # Blue control, Green ASD
                   'Sex' = ifelse(Sex=='Female','#ff6666','#008ae6'),          # Pink Female, Blue Male
                   'Age' = viridis_age_cols[as.character(Age)]) %>%            # Purple: young, Yellow: old
            dplyr::select(Age, Sex, Diagnosis)
h_clusts %>% dendextend::set('labels', rep('', nrow(datMeta))) %>% 
             dendextend::set('branches_k_color', k=9) %>% plot
colored_bars(colors=dend_meta)

rm(h_clusts, dend_meta, create_viridis_dict, viridis_age_cols)

Looking for unknown sources of batch effects


mod = model.matrix(~Diagnosis, data = datMeta %>% dplyr::select(Diagnosis, Age, Sex, Ethnicity))
mod0 = model.matrix(~1, data = datMeta %>% dplyr::select(Diagnosis, Age, Sex, Ethnicity))

sva_fit = svaseq(exp(datExpr) %>% as.matrix, mod = mod, mod0 = mod0)
## Number of significant surrogate variables is:  8 
## Iteration (out of 5 ):1  2  3  4  5
rm(mod, mod0)

Found 8 surrogate variables

Include SV estimations to datMeta information

sv_data = sva_fit$sv %>% data.frame
colnames(sv_data) = paste0('SV', 1:ncol(sv_data))

datMeta = cbind(datMeta, sv_data)


rm(sv_data, sva_fit)




Batch Effect Correction



By including the surrogate variables in the DESeq formula we only modelled the batch effects into the DEA, but we didn’t actually correct them from the data, for that we need to use ComBat (or other equivalent package) in the already normalised data

SVA surrogate variables


In some places they say you shouldn’t correct these effects on the data because you risk losing biological variation, in others they say you should because they introduce noise to the data. The only thing everyone agrees on is that you shouldn’t remove them before performing DEA but instead include them in the model.

Based on the conclusions from Practical impacts of genomic data “cleaning” on biological discovery using surrogate variable analysis it seems like it may be a good idea to remove the batch effects from the data and not only from the DE analysis:

  • Using SVA, ComBat or related tools can increase the power to identify specific signals in complex genomic datasets (they found “greatly sharpened global and gene-specific differential expression across treatment groups”)

  • But caution should be exercised to avoid removing biological signal of interest

  • We must be precise and deliberate in the design and analysis of experiments and the resulting data, and also mindful of the limitations we impose with our own perspective

  • Open data exploration is not possible after such supervised “cleaning”, because effects beyond those stipulated by the researcher may have been removed

Comparing data with and without surrogate variable correction

# Taken from https://www.biostars.org/p/121489/#121500
correctDatExpr = function(datExpr, mod, svs) {
  X = cbind(mod, svs)
  Hat = solve(t(X) %*% X) %*% t(X)
  beta = (Hat %*% t(datExpr))
  rm(Hat)
  gc()
  P = ncol(mod)
  return(datExpr - t(as.matrix(X[,-c(1:P)]) %*% beta[-c(1:P),]))
}

pca_samples_before = datExpr %>% t %>% prcomp
pca_genes_before = datExpr %>% prcomp

# Correct
mod = model.matrix(~ Diagnosis, data = datMeta)
svs = datMeta %>% dplyr::select(contains('SV')) %>% as.matrix
datExpr_corrected = correctDatExpr(as.matrix(datExpr), mod, svs)

pca_samples_after = datExpr_corrected %>% t %>% prcomp
pca_genes_after = datExpr_corrected %>% prcomp


rm(correctDatExpr)

Samples


Removing batch effects has a big impact in the distribution of the samples, separating them by diagnosis relatively well

This time, both PC1 and PC2 seem to play a role in separating the samples by Diagnosis instead of just the 1st PC as we had seen in Gandal

pca_samples_df = rbind(data.frame('ID'=colnames(datExpr), 'PC1'=pca_samples_before$x[,1],
                                  'PC2'=pca_samples_before$x[,2], 'corrected'=0),
                       data.frame('ID'=colnames(datExpr), 'PC1'=pca_samples_after$x[,1],
                                  'PC2'=pca_samples_after$x[,2], 'corrected'=1)) %>%
                 left_join(datMeta %>% mutate('ID'=datMeta$title), by='ID')

ggplotly(pca_samples_df %>% ggplot(aes(PC1, PC2, color=Diagnosis)) + geom_point(aes(frame=corrected, id=ID), alpha=0.75) + 
         xlab(paste0('PC1 (corr=', round(cor(pca_samples_before$x[,1],pca_samples_after$x[,1]),2),
                     '). % Var explained: ', round(100*summary(pca_samples_before)$importance[2,1],1),' to ',
                     round(100*summary(pca_samples_after)$importance[2,1],1))) +
         ylab(paste0('PC2 (corr=', round(cor(pca_samples_before$x[,2],pca_samples_after$x[,2]),2),
                     '). % Var explained: ', round(100*summary(pca_samples_before)$importance[2,2],1),' to ',
                     round(100*summary(pca_samples_after)$importance[2,2],1))) +
         ggtitle('Samples') + theme_minimal())
rm(pca_samples_df)


Genes


It seems like the sva correction preserves the mean expression of the genes and erases almost everything else (although what little else remains is enough to characterise the two Diagnosis groups pretty well using only the first PC)

Genes with lower levels of expression have much higher variances in the 2nd principal components than the rest of the genes

*Plot is done with only 10% of the genes so it’s not that heavy

pca_genes_df = rbind(data.frame('ID'=rownames(datExpr), 'PC1'=pca_genes_before$x[,1],
                                'PC2'=pca_genes_before$x[,2], 'corrected'=0, 'MeanExpr'=rowMeans(datExpr)),
                     data.frame('ID'=rownames(datExpr), 'PC1'=pca_genes_after$x[,1],
                                'PC2'=pca_genes_after$x[,2], 'corrected'=1, 'MeanExpr'=rowMeans(datExpr)))

keep_genes = rownames(datExpr) %>% sample(0.1*nrow(datExpr))

pca_genes_df = pca_genes_df %>% filter(ID %in% keep_genes)

ggplotly(pca_genes_df %>% ggplot(aes(PC1, PC2,color=MeanExpr)) + 
         geom_point(alpha=0.3, aes(frame=corrected, id=ID)) +
         xlab(paste0('PC1 (corr=', round(cor(pca_genes_before$x[,1],pca_genes_after$x[,1]),2),
                     '). % Var explained: ', round(100*summary(pca_genes_before)$importance[2,1],1),' to ',
                     round(100*summary(pca_genes_after)$importance[2,1],1))) +
         ylab(paste0('PC2 (corr=', round(cor(pca_genes_before$x[,2],pca_genes_after$x[,2]),2),
                     '). % Var explained: ', round(100*summary(pca_genes_before)$importance[2,2],1),' to ',
                     round(100*summary(pca_genes_after)$importance[2,2],1))) +
         scale_color_viridis() + ggtitle('Genes') + theme_minimal())
rm(pca_samples_before, pca_genes_before, mod, svs, pca_samples_after, pca_genes_after, pca_genes_df, keep_genes)

Everything looks good, so we’re keeping the corrected expression dataset

datExpr = datExpr_corrected

rm(datExpr_corrected)




Save preprocessed dataset

save(datExpr, datMeta, datGenes, DE_info, efit, file='./../Data/preprocessed_data.RData')




Session info

sessionInfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.4 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
## 
## locale:
##  [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_GB.UTF-8        LC_COLLATE=en_GB.UTF-8    
##  [5] LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
##  [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] kableExtra_1.1.0       knitr_1.28             expss_0.10.2          
##  [4] dendextend_1.13.4      limma_3.40.6           GEOquery_2.52.0       
##  [7] vsn_3.52.0             Biobase_2.44.0         BiocGenerics_0.30.0   
## [10] WGCNA_1.69             fastcluster_1.1.25     dynamicTreeCut_1.63-1 
## [13] sva_3.32.1             BiocParallel_1.18.1    genefilter_1.66.0     
## [16] mgcv_1.8-31            nlme_3.1-147           biomaRt_2.40.5        
## [19] ggpubr_0.2.5           magrittr_1.5           ggExtra_0.9           
## [22] GGally_1.5.0           gridExtra_2.3          viridis_0.5.1         
## [25] viridisLite_0.3.0      RColorBrewer_1.1-2     plotlyutils_0.0.0.9000
## [28] plotly_4.9.2           glue_1.4.1             reshape2_1.4.4        
## [31] forcats_0.5.0          stringr_1.4.0          dplyr_1.0.0           
## [34] purrr_0.3.4            readr_1.3.1            tidyr_1.1.0           
## [37] tibble_3.0.1           ggplot2_3.3.2          tidyverse_1.3.0       
## 
## loaded via a namespace (and not attached):
##   [1] readxl_1.3.1          backports_1.1.8       Hmisc_4.4-0          
##   [4] plyr_1.8.6            lazyeval_0.2.2        splines_3.6.3        
##   [7] crosstalk_1.1.0.1     digest_0.6.25         foreach_1.5.0        
##  [10] htmltools_0.4.0       GO.db_3.8.2           fansi_0.4.1          
##  [13] checkmate_2.0.0       memoise_1.1.0         cluster_2.1.0        
##  [16] doParallel_1.0.15     annotate_1.62.0       modelr_0.1.6         
##  [19] matrixStats_0.56.0    prettyunits_1.1.1     jpeg_0.1-8.1         
##  [22] colorspace_1.4-1      blob_1.2.1            rvest_0.3.5          
##  [25] haven_2.2.0           xfun_0.12             hexbin_1.28.1        
##  [28] crayon_1.3.4          RCurl_1.98-1.2        jsonlite_1.7.0       
##  [31] impute_1.58.0         survival_3.1-12       iterators_1.0.12     
##  [34] gtable_0.3.0          zlibbioc_1.30.0       webshot_0.5.2        
##  [37] scales_1.1.1          DBI_1.1.0             miniUI_0.1.1.1       
##  [40] Rcpp_1.0.4.6          xtable_1.8-4          progress_1.2.2       
##  [43] htmlTable_1.13.3      foreign_0.8-76        bit_1.1-15.2         
##  [46] preprocessCore_1.46.0 Formula_1.2-3         stats4_3.6.3         
##  [49] htmlwidgets_1.5.1     httr_1.4.1            acepack_1.4.1        
##  [52] ellipsis_0.3.1        farver_2.0.3          pkgconfig_2.0.3      
##  [55] reshape_0.8.8         XML_3.99-0.3          nnet_7.3-14          
##  [58] dbplyr_1.4.2          labeling_0.3          tidyselect_1.1.0     
##  [61] rlang_0.4.6           later_1.0.0           AnnotationDbi_1.46.1 
##  [64] munsell_0.5.0         cellranger_1.1.0      tools_3.6.3          
##  [67] cli_2.0.2             generics_0.0.2        RSQLite_2.2.0        
##  [70] broom_0.5.5           evaluate_0.14         fastmap_1.0.1        
##  [73] yaml_2.2.1            bit64_0.9-7           fs_1.4.0             
##  [76] mime_0.9              xml2_1.2.5            compiler_3.6.3       
##  [79] rstudioapi_0.11       curl_4.3              png_0.1-7            
##  [82] affyio_1.54.0         ggsignif_0.6.0        reprex_0.3.0         
##  [85] stringi_1.4.6         highr_0.8             lattice_0.20-41      
##  [88] Matrix_1.2-18         vctrs_0.3.1           pillar_1.4.4         
##  [91] lifecycle_0.2.0       BiocManager_1.30.10   cowplot_1.0.0        
##  [94] data.table_1.12.8     bitops_1.0-6          httpuv_1.5.2         
##  [97] affy_1.62.0           R6_2.4.1              latticeExtra_0.6-29  
## [100] promises_1.1.0        IRanges_2.18.3        codetools_0.2-16     
## [103] assertthat_0.2.1      withr_2.2.0           S4Vectors_0.22.1     
## [106] hms_0.5.3             grid_3.6.3            rpart_4.1-15         
## [109] rmarkdown_2.1         shiny_1.4.0.2         lubridate_1.7.4      
## [112] base64enc_0.1-3